VIBRANT Luminex QC - batch 1

Author

Laura Symul

Published

March 16, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)

tmp <- fs::dir_map("R Luminex/", source)

theme_set(theme_light())

Data

Loading Luminex data

Code
VIBRANT_dropbox_dir <-
  "/Users/laurasymul/Dropbox/Academia/Projects/VIBRANT Study Files/"

luminex_dir <- 
  str_c(VIBRANT_dropbox_dir, "15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING")
Code
plate_dirs <- list.files(luminex_dir, full.names = TRUE, pattern = "late [0-9]")

The data for the first 9 plates were uploaded to the VIBRANT Dropbox by Lenine Liebenberg in February 2025 in the following directories:

Code
plate_dirs |> str_remove(".*Projects/")
[1] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/Plate 1_potential rerun"
[2] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/Plate 2"                
[3] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 3"                
[4] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 4"                
[5] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 5"                
[6] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 6"                
[7] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 7"                
[8] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 8"                
[9] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 9"                

The data was made available as excel files. In these directories, we find the following files:

Code
files <- list.files(plate_dirs, full.names = FALSE, pattern = "xlsx$")
files
 [1] "03022025_SFT_LUM_VIBRANT_plate 8 unoptimized.xlsx" 
 [2] "20250120 VMRC VIBRANT SFT-LUM PLATE 1.xlsx"        
 [3] "20250120_SFT_LUM_VIBRANT_plate 1 un optimized.xlsx"
 [4] "20250120_SFT_LUM_VIBRANT_plate 1.xlsx"             
 [5] "20250122_SFT_LUM_VIBRANT_plate 2 un optimized.xlsx"
 [6] "20250122_SFT_LUM_VIBRANT_plate 2.xlsx"             
 [7] "20250123_SFT_LUM_VIBRANT_plate 3 un optimized.xlsx"
 [8] "20250123_SFT_LUM_VIBRANT_plate 3.xlsx"             
 [9] "20250124_SFT_LUM_VIBRANT_plate 4 un optimized.xlsx"
[10] "20250124_SFT_LUM_VIBRANT_plate 4.xlsx"             
[11] "20250129_SFT_LUM_VIBRANT_plate 5_UNOPTIMIZED.xlsx" 
[12] "20250129_SFT_LUM_VIBRANT_plate 5.xlsx"             
[13] "20250130_SFT_LUM_VIBRANT_plate 6 unoptimized.xlsx" 
[14] "20250130_SFT_LUM_VIBRANT_plate 6.xlsx"             
[15] "20250131_SFT_LUM_VIBRANT_plate 7 Unoptimized.xlsx" 
[16] "20250131_SFT_LUM_VIBRANT_plate 7.xlsx"             
[17] "20250203_SFT_LUM_VIBRANT_plate 8.xlsx"             
[18] "20250204_SFT_LUM_VIBRANT_plate 9 unoptimized.xlsx" 
[19] "20250204_SFT_LUM_VIBRANT_plate 9.xlsx"             
Caution

@Lenine: here, I’m assuming that I should only consider files that are of the form [date]_SFT_LUM_VIBRANT_plate [0-9]*.xlsx (i.e., not considering the “unoptimized” files). If this is not the case, please let me know :)

We load this data into R and store the excel sheets from the different plates in a single SummarizedExperiment object.

Code
raw <- 
  read_luminex_excels(
    dirs = plate_dirs,
    pattern = "SFT_LUM_VIBRANT_plate [0-9]*\\.xlsx$",
    name = "VIBRANT batch 1"
    ) # 
Reading data from 
    20250120_SFT_LUM_VIBRANT_plate 1.xlsx
Reading data from 
    20250122_SFT_LUM_VIBRANT_plate 2.xlsx
Reading data from 
    20250123_SFT_LUM_VIBRANT_plate 3.xlsx
Reading data from 
    20250124_SFT_LUM_VIBRANT_plate 4.xlsx
Reading data from 
    20250129_SFT_LUM_VIBRANT_plate 5.xlsx
Reading data from 
    20250130_SFT_LUM_VIBRANT_plate 6.xlsx
Reading data from 
    20250131_SFT_LUM_VIBRANT_plate 7.xlsx
Reading data from 
    20250203_SFT_LUM_VIBRANT_plate 8.xlsx
Reading data from 
    20250204_SFT_LUM_VIBRANT_plate 9.xlsx
Code
raw
# A SummarizedExperiment-tibble abstraction: 41,472 × 15
# Features=48 | Samples=864 | Assays=FI, FI_wo_background, raw_obs_conc,
#   exp_conc
   .feature       .sample     FI FI_wo_background raw_obs_conc exp_conc plate_nb
   <chr>          <chr>    <dbl>            <dbl> <chr>           <dbl>    <int>
 1 Hu b-NGF (46)  001_A1  18215            18199  *6907,81         6266        1
 2 Hu CTACK (72)  001_A1   5005             4993. 22098,91        31059        1
 3 Hu Eotaxin (4… 001_A1  14461            14407. *3256,16         3121        1
 4 Hu FGF basic … 001_A1  15876.           15862. <NA>            56685        1
 5 Hu G-CSF (57)  001_A1  15179            15164. 98623,61       104548        1
 6 Hu GM-CSF (34) 001_A1  21533            21511. *9584,69         7062        1
 7 Hu GRO-a (61)  001_A1   6564             6504. <NA>           837309        1
 8 Hu HGF (62)    001_A1  18560            18550  149685,67      177901        1
 9 Hu IFN-a2 (20) 001_A1  14173            14162. <NA>            24795        1
10 Hu IFN-g (21)  001_A1  14023            14008  *28781,91       27687        1
# ℹ 38 more rows
# ℹ 8 more variables: plate_name <chr>, plate_row <chr>, plate_col <fct>,
#   sample_type <fct>, type <chr>, description <chr>, sample_id <chr>,
#   dilution <dbl>

Plate design and sample names

Code
plot_plate_design(raw) 
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]

Caution

@Lenine: It looks like the sample names on plate 1 are different than the sample names on all other plates. I see that there is an additional excel file on the dropbox (Plate Layouts.xlsx) that provides both format. I assumed sample names should be harmonized across plates and used the format from plate 2+.

Caution

@Lenine: Sample on plate 4 well A5 has a weird name (LO[). On the excel file, it looks like it should be sample 6_14_23sa01 (SA_068-20-0153_V1_1000). Can you confirm?

Code
samples <- 
  raw |> 
  as_tibble() |> 
  select(.sample, sample_type, sample_id) |> 
  distinct()
Caution

@Lenine: There is also a small mismatch in the names of biological controls between different plates. I assumed that BIO CTRL is the same as BIOL CTRL. Let me know otherwise.

Code
samples |> filter(sample_id |> str_detect("CTRL")) |> dplyr::count(sample_id)
# A tibble: 3 × 2
  sample_id     n
  <chr>     <int>
1 BIO CTRL      2
2 BIOL CTRL    16
3 STD CTRL     18
Code
raw <- 
  raw |> 
  mutate(sample_id = sample_id |> str_replace_all("BIO CTRL", "BIOL CTRL"))
Code
raw <- 
  raw |> 
  mutate(
    sample_type = 
      case_when(
        str_detect(sample_id, " CTRL") ~ "Positive control", 
        TRUE ~ sample_type
        )
    )
Code
rm(samples)

Loading manifest data

We load the data contained in the first sheet (Plating) from the Plate Layouts.xlsx file. At the moment, we only consider the data in A2:X372.

Code
manifest <- 
  read_xlsx(
    str_c(luminex_dir, "/Plate Layouts.xlsx"), 
    sheet = "Plating", range = "A2:X372"
  )
Code
manifest_col <- c("Sample_Vcode", "shorthand title", "alternate title", "Location", "Round 1 Final dilution factor")

We subset and rename the column of interest (Sample_Vcode, shorthand title, alternate title, Location, Round 1 Final dilution factor):

Code
manifest <- 
  manifest |> 
  select(manifest_col) |> 
  dplyr::rename(
    sample_vcode = Sample_Vcode,
    shorthand_title = `shorthand title`,
    sample_id = `alternate title`,
    location = Location,
    dilution_manifest = `Round 1 Final dilution factor`
  )

manifest |> head() |> gt(caption = "Top rows of the manifest data after column selection and renaming.")
Top rows of the manifest data after column selection and renaming.
sample_vcode shorthand_title sample_id location dilution_manifest
068200008-1000 6_1_23sa01 SA_068-20-0008_V1_1000 SA 4.8
068200008-1100 6_1_23sa02 SA_068-20-0008_V2_1100 SA 7.2
068200008-1200 6_1_23sa03 SA_068-20-0008_V3_1200 SA 6.9
068200008-1300 6_1_23sa04 SA_068-20-0008_V4_1300 SA 4.8
068200008-1400 6_1_23sa05 SA_068-20-0008_V5_1400 SA 5.5
068200008-1500 6_1_23sa06 SA_068-20-0008_V6_1500 SA 4.9

We rename the plate 1 sample_id to match the actual (manifest) sample_id.

Code
samples <- 
  colData(raw) |> 
  as.data.frame() |> 
  rownames_to_column(".sample") |> 
  as_tibble()

samples <-
  samples |> 
  left_join(
    manifest |> 
      select(shorthand_title, sample_id) |> 
      dplyr::rename(sample_id_manifest = sample_id, sample_id = shorthand_title),
    by = join_by(sample_id)
  )

should_be_zero <- 
  samples |> 
  filter((plate_nb != 1) & !is.na(sample_id_manifest)) |> 
  nrow()

if (should_be_zero > 0) 
  stop("There are samples with a manifest sample_id that are not on plate 1")


samples <-
  samples |> 
  mutate(
    sample_id =
      case_when(
        !is.na(sample_id_manifest) ~ sample_id_manifest,
        TRUE ~ sample_id
      )
  )

We also include the location and dilution_manifest columns in the colData(raw) data.

Code
samples <- 
  samples |> 
  left_join(manifest, by = join_by(sample_id))

raw$sample_id <- samples$sample_id
raw$location <- samples$location
raw$dilution_manifest <- samples$dilution_manifest

rm(samples)

We compare the dilution factors from the excel file and the manifest:

Code
colData(raw) |> 
  as.data.frame() |> 
  rownames_to_column(".sample") |>
  as_tibble() |>
  ggplot() +
  aes(x = dilution, y = dilution_manifest, col = plate_name) +
  geom_abline(intercept = 0, slope = 1, col = "gray80") +
  geom_point(alpha = 0.5, size = 0.2) +
  coord_fixed() +
  labs(
    x = "Dilution factor from Luminex excel file\n(dilution sheet)", 
    y = "Dilution factor from manifest\nRound 1 Final dilution factor",
    color = "Plate"
    ) +
  facet_wrap(plate_name |> str_replace_all("_", " ") |> str_wrap(20) ~ ., nrow = 2) +
  guides(col = "none")

Caution

@Lenine: It looks like for most plates (except some values in plate 6 and 7?), the dilution factors have been rounded to the lowest integer. Is this correct? Should we use the actual value instead (the manifest value?). For now, I keep the values from the excel file, but let me know if we should use the manifest values instead.

We also add a pid (participant id) and a visit_nb that we extract from the sample_id:

Code
raw$pid <- 
  ifelse(
    str_detect(raw$sample_id, "^US|^SA"), 
    raw$sample_id |> str_remove("SA_068-20-|US_068-10-") |> str_remove("_V[0-9].*"), 
    NA
    )
raw$visit_nb <- 
  raw$sample_id |> str_extract("V[0-9]*") |> str_remove("V") |> as.integer()

Number of replicates per location, participants, and visits

Code
# colData(raw) |> 
#   as.data.frame() |> 
#   dplyr::count(location, pid, visit_nb) |> 
#   filter(!is.na(pid), !is.na(visit_nb)) |> 
#   pivot_wider(names_from = visit_nb, values_from = n, names_prefix = "V", values_fill = 0) |> 
#   gt(caption = "Number of samples per location, participant, and visit.")


colData(raw) |> 
  as.data.frame() |> 
  dplyr::count(location, pid, visit_nb) |> 
  filter(!is.na(pid), !is.na(visit_nb)) |>
  arrange(visit_nb) |> 
  mutate(
    visit = str_c("V", visit_nb) |> fct_inorder(),
    n = n |> factor()
    ) |>
  ggplot(aes(x = visit, y = pid, fill = n)) +
  geom_tile() +
  facet_wrap(. ~ location, scales = "free") +
  scale_fill_manual(name = "Number of samples", values = c("steelblue1", "steelblue3")) +
  ylab("Participant IDs")

Standard curves

Code
raw |> plot_standard_curves() 

Code
raw |> filter(.feature %in% "Hu IL-7 (74)") |> 
  plot_standard_curves() + facet_wrap(. ~ plate_name + .feature) + guides(col = "none") +
  ggtitle("Standard curves for Hu IL-7 (74)")

Dilution factors

Code
raw |> plot_dilution_factors()

Exclusion of failed analytes

We exclude the cytokines that do not have any values across all plates

Code
raw <- exclude_missing_analytes(raw)
VIBRANT batch 1 
     0 excluded analytes:   

Imputation of out-of-range values

Code
raw <- impute_OOR(raw)
VIBRANT batch 1
    Added 2 assays
        -`conc` with the imputed concentrations
        -`value_type` with the type of value (observed, extrapolated, below LLOQ, above ULOQ)
    and added @rowData with ULOQ, LLOQ, and median values for each analyte, as well as the mean concentrations across manufacturer's control replicates.

Overview of concentrations by analyte

Code
plot_conc_by_analyte(raw, color_by = "value_type")

Code
plot_conc_by_analyte(raw, color_by = "sample_type")

Code
plot_conc_by_analyte(raw, color_by = "plate_name")

Data transformations

We next explore which transformation (no transformation, log, asinh, or square root) of the data is most suitable to stabilize the variance of the data.

Code
check_transformation(raw, transf = "none") +
check_transformation(raw, transf = "asinh") +
check_transformation(raw, transf = "log") +
check_transformation(raw, transf = "sqrt")

As expected, the log transformation appears to be the most suitable for stabilizing the variance of the data.

Code
assay(raw, "conc_log2") <- raw |> assay("conc") |> log2()

Distributions & exploratory analyses

Marginal distributions

Code
plot_marginals <- function(raw, by = "plate_nb", binwidth = 1){
  tmp <- raw |> as_tibble()
  tmp <- tmp |> bind_rows(tmp |> mutate(.feature = "all analytes"))
  tmp$by <- str_c(by ," ", tmp[[by]])
  
  min_c <- min(tmp$conc_log2[!is.infinite(tmp$conc_log2)], na.rm = TRUE)
  max_c <- max(tmp$conc_log2[!is.infinite(tmp$conc_log2)], na.rm = TRUE)

  tmp |> 
    mutate(
      conc_log2_bin = 
        conc_log2 |> 
        cut(breaks = seq(min_c, max_c, by = binwidth))
    ) |> 
    dplyr::count(by, .feature, conc_log2_bin) |>
    group_by(by, .feature) |> 
    mutate(f = n/sum(n)) |> 
    ggplot( ) +
    aes(x = conc_log2_bin, y = by, fill = by, alpha = f) +
    geom_tile() +
    facet_wrap(.feature ~ ., scale = "free") +
    labs(
      x = "log2(concentration)\n(independent scales per analyte)",
      y = by
    ) +
    scale_x_discrete(breaks = NULL) +
    guides(fill = "none") +
    theme(strip.text.y = element_text(angle = 0))
  
}

Per plate

Code
plot_marginals(raw, by = "plate_nb")

Per location

Code
plot_marginals(raw, by = "location")

Per visit

Code
plot_marginals(raw, by = "visit_nb")

Standards and controls

Comparison of standards and controls concentrations across plates (x-axis) and analytes (facets).

Code
raw |> 
  filter(sample_type %in% c("Standard", "Positive control", "Blank", "Manuf. control")) |> 
  ggplot() +
  aes(x = plate_nb |> factor(), y = conc_log2, col = sample_id, shape = sample_type) +
  geom_point(size = 1, alpha = 0.5) +
  facet_wrap(.feature ~ ., nrow = 3) +
  theme(legend.position = "bottom") +
  xlab("Plate nb") +
  ylab("log2(concentration)")

Comparison of biological controls (colored dots) across plates (x-axis) and analytes (facets) (constrasted with the distribution of samples in light gray):

Code
raw |> 
  filter((sample_id == "BIOL CTRL") | (sample_type == "Sample")) |> 
  mutate(plate_ctrl = ifelse(sample_id == "BIOL CTRL", plate_nb, NA) |> factor()) |>
  ggplot() +
  aes(x = plate_nb |> factor(), y = conc_log2, col = plate_ctrl, alpha = is.na(plate_ctrl), shape = (value_type == "Observed concentration")) +
  geom_point(size = 1) +
  facet_wrap(. ~ .feature |> str_remove("Hu ") |> str_remove("\\([0-9]*\\)"), nrow = 3) + # , scales = "free"
  scale_x_discrete(breaks = NULL) +
  scale_alpha_manual(values = c(1, 0.1)) +
  xlab("Plate nb") +
  ylab("log2(concentration)") + 
  scale_color_discrete("Plate nb") # + theme(strip.text.x = element_text(angle = 90, hjust = 0)) 

PCA

Code
complete_samples <- 
  raw |> 
  as_tibble() |> 
  group_by(.sample) |>
  summarize(ok = !any(is.na(conc_log2))) |>
  mutate(i = row_number()) |>
  filter(ok)

complete <- raw[, complete_samples$.sample]
Code
pca_res <- 
  prcomp(complete |> assay("conc_log2") |> t(), center = TRUE, scale = TRUE)
Code
factoextra::fviz_eig(pca_res, ncp = 48)

As often with Luminex data, the first component explains most of the variance.

Excluding the 1st component, we can then see that the variance decreases rapidly.

Code
factoextra::fviz_eig(pca_res, ncp = 48) +
  geom_hline(yintercept = 100*1/48, linetype = 2) +
  ylim(c(0, 20))

Code
plot_pca_scores <- function(pca_res, raw, axes = 1:2, col_by = "sample_type"){
  scores <- 
    colData(raw) |> 
    as.data.frame() |> 
    rownames_to_column(".sample") |> 
    as_tibble() |> 
    left_join(
      as_tibble(pca_res$x) |> 
        mutate(.sample = rownames(pca_res$x)),
      by = join_by(.sample)
    )
  
  var <- factoextra::get_eig(pca_res)
  
  xvar <- str_c("PC", axes[1])
  yvar <- str_c("PC", axes[2])
  ggplot(scores) +
    aes(x = .data[[xvar]], y = .data[[yvar]], col = .data[[col_by]]) +
    geom_vline(xintercept = 0, col = "gray") +
    geom_hline(yintercept = 0, col = "gray") +
    geom_point(alpha = 0.8) +
    labs(
      x = str_c(xvar, " (", var$variance.percent[axes[1]] |> round(1), "%)"), 
      y = str_c(yvar, " (", var$variance.percent[axes[2]] |> round(1), "%)")
      ) +
    coord_fixed()
}

Checking the scores and biplots by sample type:

Code
g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "sample_type") 

g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "sample_type") 

g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "sample_type")

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Focus on controls and standards:

Code
complete <- 
  complete |> 
  mutate(control_id = ifelse(sample_type == "Sample", NA, sample_id))

g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "control_id") 

g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "control_id") 

g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "control_id")

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Focus on the biological controls across plates:

Code
complete <- 
  complete |> 
  mutate(plate_biol_ctrl = ifelse(sample_id == "BIOL CTRL", plate_name, NA))

g_12 <- 
  plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "plate_biol_ctrl") +
  scale_color_discrete(na.value = "gray90")

g_23 <- 
  plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "plate_biol_ctrl") +
  scale_color_discrete(na.value = "gray90")


g_45 <- 
  plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "plate_biol_ctrl") +
  scale_color_discrete(na.value = "gray90")

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Plate effects?

Code
g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "plate_name") 

g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "plate_name") 

g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "plate_name") 
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Plate effects when only considering samples (no controls or standards):

Code
g_12 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "plate_name") +
  stat_ellipse()

g_23 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "plate_name") +
  stat_ellipse()

g_45 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "plate_name") +
  stat_ellipse()
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Site differences?

Code
g_12 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "location") +
  stat_ellipse()

g_23 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "location") +
  stat_ellipse() 

g_45 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "location") +
  stat_ellipse()
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Visit differences?

Code
complete <- complete |> mutate(visit = str_c("V", visit_nb))

visit_colors <- 
  c(
    "V1" = "red", "V2" = "green3", 
    "V3" = "steelblue1", "V4" = "steelblue2", "V5" = "steelblue3", "V6" = "steelblue4"
  )
Code
g_12 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "visit") +
  stat_ellipse() +
  scale_color_manual(values = visit_colors) 

g_23 <- 
 plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "visit") +
  stat_ellipse() +
  scale_color_manual(values = visit_colors) 

g_45 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "visit") +
  stat_ellipse() +
  scale_color_manual(values = visit_colors) 
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Effect of the dilution factor?

Code
g_12 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "dilution") +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 

g_23 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "dilution")  +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 

g_45 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "dilution")  +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Code
g_12 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "dilution_manifest") +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 

g_23 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "dilution_manifest")  +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 

g_45 <- 
  plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "dilution_manifest")  +
  scale_color_gradient(low = "gray80", high = "dodgerblue") 
  

g_12 / (g_23 | g_45) +
  plot_layout(guides = "collect", heights = c(1, 1.5))

Code
scores_long <- 
    colData(raw) |> 
    as.data.frame() |> 
    rownames_to_column(".sample") |> 
    as_tibble() |> 
    inner_join(
      as_tibble(pca_res$x) |> 
        mutate(.sample = rownames(pca_res$x)) |> 
        pivot_longer(cols = -.sample, names_to = "PC", values_to = "value", names_prefix = "PC") |> 
        mutate(PC = PC |> as.integer()),
      by = join_by(.sample)
    )
Code
scores_long |> 
  filter(sample_type == "Sample", PC <= 6) |>
  ggplot(aes(x = dilution_manifest, y = value)) +
  geom_point(alpha = 0.5, size = 0.5) +
  geom_smooth(method = "lm", formula = "y ~ x") +
  facet_wrap(PC ~ ., scales = "free", labeller = label_both) +
  ylab("PC score") +
  xlab("Dilution factor from manifest")

Caution

It looks like the correction for the dilution is a little too aggressive (samples with high dilution factors have higher concentrations on average).

Marginal differences between replicates

Code
tmp <- 
  raw |> 
  as_tibble() |> 
  group_by(sample_id, .feature) |> 
  mutate(n = n()) |> 
  ungroup() |> 
  filter(n > 1, !is.na(conc_log2))

tmp <- 
  left_join(
    tmp |> 
      select(.feature, .sample, sample_id, sample_type, plate_nb, conc_log2) |> 
      dplyr::rename(.sample_r1 = .sample, plate_nb_r1 = plate_nb, conc_log2_r1 = conc_log2),
    tmp |> 
      select(.feature, .sample, sample_id, plate_nb, conc_log2) |> 
      dplyr::rename(.sample_r2 = .sample, plate_nb_r2 = plate_nb, conc_log2_r2 = conc_log2),
    by = join_by(.feature, sample_id),
    relationship = "many-to-many"
  ) |> 
  filter(.sample_r1 != .sample_r2)
Code
tmp <- 
  tmp |> 
  mutate(
    diff = conc_log2_r1 - conc_log2_r2,
    abs_diff = abs(diff),
    mean = (conc_log2_r1 + conc_log2_r2)/2,
    sd = sqrt(((conc_log2_r1 - mean)^2 + (conc_log2_r2 - mean)^2)),
    cv = sd/mean,
    replicate_type = ifelse(plate_nb_r1 == plate_nb_r2, "within plate", "across plate")
  )
Code
tmp |> 
  ggplot() +
  aes(x = diff, col = replicate_type) +
  geom_density(bw = 0.05) +
  facet_wrap(.feature ~ ., scales = "free") +
  scale_x_continuous(limits = c(-2.5, 2.5))

Code
tmp |> 
  filter(!is.infinite(diff)) |> 
  group_by(sample_type, plate_nb_r1, plate_nb_r2, replicate_type) |> 
  summarize(
    mean_abs_diff = mean(abs_diff),
    median_abs_diff = median(abs_diff),
    .groups = "drop"
    ) |> 
  ggplot() +
  aes(
    x = plate_nb_r1 |> factor(), 
    y = median_abs_diff, 
    col = plate_nb_r2 |> factor(),
    shape = replicate_type
    ) +
  geom_point() +
  facet_grid(sample_type ~ .) +
  xlab("Plate nb") +
  ylab("Median absolute difference between replicates") +
  scale_color_discrete("Replicate\nplate nb") + 
  scale_shape_discrete("Replicate type")

Code
tmp |> 
  # filter(sample_type == "Sample") |>
  ggplot() +
  aes(x = plate_nb_r2 |> factor(), y = abs_diff, col = replicate_type) +
  geom_boxplot(outlier.shape = NA) +
  ggbeeswarm::geom_quasirandom(size = 0.5, alpha = 0.1) +
  facet_grid(sample_type ~ plate_nb_r1, scales = "free", space = "free")